CS765 - Information Retrieval project¶
import re
import os
import nltk
import collections
import numpy as np
def load_allfiles():
    """Load every Gutenberg document as a list of word tokens.

    Returns one entry per corpus file; each entry is that document's
    token sequence as provided by the NLTK corpus reader.
    """
    return [
        nltk.corpus.gutenberg.words(fid)
        for fid in nltk.corpus.gutenberg.fileids()
    ]
# load data: the corpus documents plus their file identifiers
filesList = load_allfiles()
print(len(filesList), "text documents\n")
file_names = nltk.corpus.gutenberg.fileids()
print("\nfile names are:", file_names)
18 text documents file names are: ['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
Get Document titles, overview & content¶
document titles, overviews and content will be added to the database fields.
# get document titles
def getDocTitles():
    """Return each Gutenberg file id with its '.txt' extension removed."""
    return [fid.split('.')[0] for fid in nltk.corpus.gutenberg.fileids()]
# get document overviews
def getOverviews(fileNames, fileDirectory, max_chars=200):
    """Return the first ``max_chars`` characters of each readable file.

    Parameters
    ----------
    fileNames : list[str]
        File names to read, relative to ``fileDirectory``.
    fileDirectory : str
        Directory that contains the corpus files.
    max_chars : int, optional
        Length of the overview snippet (default 200, matching the
        original hard-coded value).

    Returns
    -------
    list[str]
        Overviews of the files that could be read.  NOTE: missing or
        unreadable files are silently skipped, so the result may be
        shorter than ``fileNames`` and lose positional alignment.
    """
    overviews = []
    for file_name in fileNames:
        file_path = os.path.join(fileDirectory, file_name)
        # guard clause: skip anything that is not a regular file
        if not os.path.isfile(file_path):
            continue
        try:
            # latin-1 decodes any byte sequence, so only I/O errors can occur
            with open(file_path, "r", encoding='latin-1') as file:
                overviews.append(file.read(max_chars))
        except OSError as e:
            # best-effort: report and keep scanning the remaining files
            print(f"Error reading {file_name}: {e}")
    return overviews
# get document contents
def getContents(fileNames, fileDirectory):
    """Read the full text of each file in ``fileNames``.

    Parameters
    ----------
    fileNames : list[str]
        File names to read, relative to ``fileDirectory``.
    fileDirectory : str
        Directory that contains the corpus files.

    Returns
    -------
    list[str]
        Full contents of the files that could be read.  Missing or
        unreadable files are skipped (with a printed warning), so the
        result may be shorter than ``fileNames``.
    """
    content_list = []
    for file_name in fileNames:
        file_path = os.path.join(fileDirectory, file_name)
        # guard clause: skip anything that is not a regular file
        if not os.path.isfile(file_path):
            continue
        try:
            with open(file_path, "r", encoding='latin-1') as file:
                content_list.append(file.read())
        except OSError as e:
            # best-effort: report and keep scanning the remaining files
            print(f"Error reading {file_name}: {e}")
    return content_list
# Example usage: fetch titles, overviews and contents for the whole corpus
file_names_list = ['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
directory_name = 'gutenberg'
titles = getDocTitles()
overviews = getOverviews(file_names_list, directory_name)
contents = getContents(file_names_list, directory_name)
print(f"There are {len(titles)} titles {len(overviews)} overviews and {len(contents)} contents")
print(f"\ntitle of the first file is:\n\n {titles[0]}")
print(f"\noverview of the first file is:\n\n {overviews[0]}")
# full content is long, so it stays commented out
#print(f"\ncontent of the first file is:\n\n {contents[0]}")
There are 18 titles 18 overviews and 18 contents title of the first file is: austen-emma overview of the first file is: [Emma by Jane Austen 1816] VOLUME I CHAPTER I Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to unite some of the best blessings of existence; and h
2. Text Preprocessing¶
To build a term-document matrix, which is going to be used for querying documents, first we need to pre-process the text.
Tokenisation¶
Already done when reading files on the first step, read files as tokens.
Casefolding¶
Casefolding tries to create a common case for all documents. Basically, there are two ways to do casefolding:
1. lower() - the string method which simply translates all upper case letters to lower case
2. casefold() - python method for strings that translates common letters ( used for this project)
def casefold_Text(text_):
    """Return *text_* with every token casefolded (aggressive lowercasing)."""
    return list(map(str.casefold, text_))
# cleaning special characters
def normalize_Text(tokens):
    """Strip every character except ASCII letters and whitespace from each
    token, then drop tokens that became empty."""
    stripped = (re.sub(r"[^a-zA-Z\s]", '', tok) for tok in tokens)
    return [tok for tok in stripped if tok]
# select words that are not in the stopword list
def filterStopWords_Text(text_):
    """Return the tokens of *text_* that are not English stopwords.

    The NLTK stopword list is converted to a set once, so each membership
    test is O(1) instead of a linear scan of the ~179-entry list.
    """
    stopwords = set(nltk.corpus.stopwords.words("english"))
    return [word for word in text_ if word not in stopwords]
# stems all words in a file
def stem_Text(text_):
    """Apply the Porter stemmer to every token and return the stems."""
    stemmer = nltk.stem.PorterStemmer()
    return list(map(stemmer.stem, text_))
def clean_Text(text_):
    """Run the full cleaning pipeline over a token list:
    casefold -> strip special characters -> drop stopwords -> stem."""
    for step in (casefold_Text, normalize_Text, filterStopWords_Text, stem_Text):
        text_ = step(text_)
    return text_
def casefold_Text(text_):
    """Casefold each token (re-definition of the earlier helper)."""
    folded = []
    for tok in text_:
        folded.append(tok.casefold())
    return folded
# Test the casefolding step on a sample sentence
testText = "This is a Test TexT that has to be cleaned 155615 [ ]] ROBERT ;) DROP TABLE Students;"
# casefolding
cf_tokensList = casefold_Text(testText.split())
print("before casefolding:", testText)
print("after casefolding:", ' '.join(cf_tokensList))
before casefolding: This is a Test TexT that has to be cleaned 155615 [ ]] ROBERT ;) DROP TABLE Students; after casefolding: this is a test text that has to be cleaned 155615 [ ]] robert ;) drop table students;
Normalisation of text¶
- assuming special characters are not helpful for the query, we need to remove them.
- remove special characters that do not contribute much to the search
- we can use a regular expression
`re` module to remove text that matches specific patterns. - we can easily define regular expression formulas to remove a specific set of characters from text.
# normalizing: remove special characters
def normalize_Text(tokens):
    """Remove non-letter, non-whitespace characters; discard empty results."""
    non_letter = re.compile(r"[^a-zA-Z\s]")
    cleaned = [non_letter.sub('', t) for t in tokens]
    return list(filter(None, cleaned))
# Test the normalization step on the same sample sentence
testText = "This is a Test TexT that has to be cleaned 155615 [ ]] ROBERT ;) DROP TABLE Students;"
norm_tokens = normalize_Text(testText.split())
print("before normalize_Text:", testText)
print("after normalize_Text:", ' '.join(norm_tokens))
before normalize_Text: This is a Test TexT that has to be cleaned 155615 [ ]] ROBERT ;) DROP TABLE Students; after normalize_Text: This is a Test TexT that has to be cleaned ROBERT DROP TABLE Students
Stop word from a text¶
- assume stopwords like 'the', 'is', 'are', etc. do not have semantic value for querying documents
- hence remove stop words as well
- a collection of stop words,for different languages, can be downloaded from nltk
- removing the stop words helps to speed up our query
Zipf's law shows that most of the words in a text are stop words
# select words that are not in the stopword list
def filterStopWords_Text(text_):
    """Drop English stopwords from a token list.

    Re-definition of the earlier helper; the stopword list is turned into
    a set so per-token membership checks are O(1) rather than O(179).
    """
    stopword_set = set(nltk.corpus.stopwords.words("english"))
    return [w for w in text_ if w not in stopword_set]
# Test stopword removal on a short sentence
testText = "This and that can happen at any time of the day"
kept_tokens = filterStopWords_Text(testText.split())
print("before stopword removal:", testText)
print("after stopword removal:", ' '.join(kept_tokens))
before stopword removal: This and that can happen at any time of the day after stopword removal: This happen time day
StopWords¶
# Inspect the NLTK English stopword list
stopWordlist = nltk.corpus.stopwords.words("english")
print("number of stop words in nltk: ", len(stopWordlist))
print(stopWordlist[:10])
# BUG FIX: [-10:-1] dropped the final stopword (printed only 9 entries);
# [-10:] shows the last ten as intended
print(stopWordlist[-10:])
number of stop words in nltk: 179 ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"] ['shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn']
Stemming text¶
- stemming keeps the root of the word
- for the search query, a word and its variations should not make much difference
- e.g. the words:
Test, testing, and tested will just be changed to test, but the frequency is kept - there are different kinds of stemmers: we can use Porter's stemmer
# stems all words in a file
def stem_Text(text_):
    """Porter-stem each token (re-definition of the earlier helper)."""
    porter = nltk.stem.PorterStemmer()
    stems = []
    for tok in text_:
        stems.append(porter.stem(tok))
    return stems
# Test the stemming step on words sharing one root
testText = "running and runners will run"
# stemming
stemmed = stem_Text(testText.split())
print("before stemming:", testText)
print("after stemming:", ' '.join(stemmed))
before stemming: running and runners will run after stemming: run and runner will run
Cleaning text¶
- applying all the above cleaning functions to the text at once.
def clean_Text(text_):
    """Clean a token list end-to-end (re-definition of the earlier helper):
    casefold, strip special characters, drop stopwords, then stem."""
    folded = casefold_Text(text_)
    normalized = normalize_Text(folded)
    content_words = filterStopWords_Text(normalized)
    return stem_Text(content_words)
# Run the whole cleaning pipeline on the sample sentence
testText = "This is a Test TexT that has to be cleaned 155615 [ ]] ROBERT ;) DROP TABLE Students;"
clean_text = clean_Text(testText.split())
# BUG FIX: the label printed as "nbefore cleaning" — the intended "\n"
# escape was missing its backslash
print("\nbefore cleaning:", testText)
print("after cleaning:", ' '.join(clean_text))
nbefore cleaning: This is a Test TexT that has to be cleaned 155615 [ ]] ROBERT ;) DROP TABLE Students; after cleaning: test text clean robert drop tabl student
Pre-processing entire files¶
using the above list of functions, we can now clean and prepare the texts
casefold_Files()
normalize_Files()
remove_stopwords_Files ()
stem_Files()
we can clean the entire set of files using clean_Files(), which calls the above functions.
def casefold_Files(files_):
    """Casefold every token of every file in *files_*."""
    return [casefold_Text(tokens) for tokens in files_]
# clean special characters
def normalize_Files(files_):
    """Strip special characters from every file's token list."""
    return [normalize_Text(tokens) for tokens in files_]
def remove_stopwords_Files(files_):
    """Drop English stopwords from every file's token list."""
    return [filterStopWords_Text(tokens) for tokens in files_]
# stems all words in a file
def stem_Files(files_):
    """Porter-stem every file's token list."""
    return [stem_Text(tokens) for tokens in files_]
def clean_Files(files_):
    """Apply the complete cleaning pipeline to every file:
    casefold, normalize, remove stopwords, stem — in that order."""
    for stage in (casefold_Files, normalize_Files,
                  remove_stopwords_Files, stem_Files):
        files_ = stage(files_)
    return files_
Comparison of files before and after cleaning¶
- there are 18 files
- each file is a list of tokens
- allFiles_ contain a list of files which in turn contain a list of tokens
allFiles_ = load_allfiles()
# inspect only the first document, then stop
for tokens in allFiles_:
    print("list of all tokens in the first file:", len(tokens))
    # list of unique tokens
    uniq = collections.Counter(tokens)
    print("list of unique tokens in the first file:", len(uniq))
    break
list of all tokens in the first file: 192427 list of unique tokens in the first file: 7811
# We use pandas here as it provides some nice functionality for showing tables
import pandas as pd
import collections
def addRow(allFiles_):
    """
    Compute one table row of corpus statistics.

    allFiles_ is a list of files; each file is a list of tokens.
    For each file, returns the percentage of unique tokens relative to
    the total token count, rounded to two decimals and formatted as a
    string with a trailing space (the format the table display expects).
    """
    rows = []
    for tokens in allFiles_:
        total = len(tokens)
        unique = len(collections.Counter(tokens))
        pct = round(unique / total * 100, 2)
        rows.append("%s " % pct)
    return rows
# Header of the table
df = pd.DataFrame(columns=nltk.corpus.gutenberg.fileids())
# load the corpus and report number words and unique number of words
allFiles = load_allfiles()
df.loc["before cleaning"] = addRow(allFiles)
# apply each cleaning stage in turn and record the statistics after each
stage_files = allFiles
for label, stage in [("after Casefold", casefold_Files),
                     ("after normalization", normalize_Files),
                     ("after stopwords removed", remove_stopwords_Files),
                     ("after stemming", stem_Files)]:
    stage_files = stage(stage_files)
    df.loc[label] = addRow(stage_files)
# the one-shot pipeline should reproduce the staged result exactly
allFiles = load_allfiles()
df.loc["clean at once"] = addRow(clean_Files(allFiles))
print("The ratio in percent of unique tokens to the total number of tokens in each file: \n")
df
The ratio in percent of unique tokens to the total number of tokens in each file:
| austen-emma.txt | austen-persuasion.txt | austen-sense.txt | bible-kjv.txt | blake-poems.txt | bryant-stories.txt | burgess-busterbrown.txt | carroll-alice.txt | chesterton-ball.txt | chesterton-brown.txt | chesterton-thursday.txt | edgeworth-parents.txt | melville-moby_dick.txt | milton-paradise.txt | shakespeare-caesar.txt | shakespeare-hamlet.txt | shakespeare-macbeth.txt | whitman-leaves.txt | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| before cleaning | 4.06 | 6.25 | 4.83 | 1.36 | 21.79 | 7.95 | 9.3 | 8.84 | 9.22 | 9.64 | 9.83 | 4.55 | 7.41 | 11.1 | 13.78 | 14.58 | 17.36 | 9.25 |
| after Casefold | 3.82 | 5.94 | 4.52 | 1.26 | 18.37 | 7.09 | 8.22 | 7.73 | 8.59 | 9.06 | 9.17 | 4.01 | 6.61 | 9.32 | 11.74 | 12.62 | 14.97 | 8.04 |
| after normalization | 4.38 | 6.82 | 5.2 | 1.59 | 21.78 | 8.21 | 9.3 | 9.4 | 9.88 | 10.56 | 10.73 | 4.87 | 7.76 | 11.15 | 14.49 | 15.53 | 18.86 | 9.76 |
| after stopwords removed | 9.5 | 14.62 | 11.39 | 3.32 | 36.79 | 16.98 | 18.21 | 19.8 | 20.18 | 21.47 | 21.74 | 10.45 | 15.21 | 19.42 | 26.33 | 28.87 | 33.14 | 18.73 |
| after stemming | 5.96 | 9.52 | 7.28 | 2.46 | 30.01 | 12.88 | 14.76 | 14.97 | 14.17 | 15.37 | 15.79 | 6.97 | 9.53 | 13.52 | 21.3 | 22.96 | 27.23 | 12.48 |
| clean at once | 5.96 | 9.52 | 7.28 | 2.46 | 30.01 | 12.88 | 14.76 | 14.97 | 14.17 | 15.37 | 15.79 | 6.97 | 9.53 | 13.52 | 21.3 | 22.96 | 27.23 | 12.48 |
3. Creating Term-Document Matrix from the documents¶
Term-Document Matrix¶
The Term-Document-Matrix allows to store the frequency of each term in each document.
The Term-Document Matrix is created by first getting the number of unique words in the entire corpus and then collecting the frequencies of those in the respective documents.
The function is then applied to the small text corpus.
summary of facts about files¶
# Run files summary on each document
def summarize_Files(files_):
    """Count term frequencies per document and over the whole corpus.

    Returns a tuple (corpus_counter, per_document_counters) where the
    first element sums the per-document counts.
    """
    per_document = [collections.Counter(tokens) for tokens in files_]
    corpus_total = collections.Counter()
    for doc_counter in per_document:
        corpus_total += doc_counter
    return (corpus_total, per_document)
allFiles = load_allfiles()
allFiles_clean = clean_Files(allFiles)
counter_allFiles_clean, counters_documents = summarize_Files(allFiles_clean)
# corpus size in tokens vs. vocabulary size in distinct terms
total_tokens = sum(counter_allFiles_clean.values())
n_terms = len(counter_allFiles_clean)
print("The number of tokens in all files is ", total_tokens, "tokens and", n_terms, "terms.")
print("The dimension of the respective TDM is", len(allFiles), "files x", n_terms, "terms in all files")
The number of tokens in all files is 1025767 tokens and 25476 terms. The dimension of the respective TDM is 18 files x 25476 terms in all files
Zipf's Law - distribution of terms in files¶
import plotly.express as px
import pandas as pd
# count all unique terms in all files (uncleaned corpus)
allFiles = load_allfiles()
counter_allFiles_unclean, counters_documents = summarize_Files(allFiles)
# rank the terms by descending frequency
ranked = sorted(counter_allFiles_unclean.items(), key=lambda item: item[1], reverse=True)
df = pd.DataFrame(ranked, columns=['Word', 'Frequency'])
# Create a bubble plot using Plotly Express
fig = px.scatter(df, x=df.index, y='Frequency', size='Frequency',
                 color='Frequency', opacity=0.9,
                 labels={'index': "Index of term in the sorted dictionary", 'Frequency': "Frequency of terms"},
                 template='plotly_white')
# Annotate the top 5 most frequent terms
for row in df.head(5).itertuples():
    fig.add_annotation(
        x=row.Index,
        y=row.Frequency,
        text=row.Word,
        showarrow=False,
        arrowhead=3,
        ax=0,
        ay=-30
    )
# Add grid lines
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='grey', zeroline=False)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='grey')
# Set plot background color
fig.update_layout(paper_bgcolor='lightblue', plot_bgcolor='lightblue')
# Center the title
fig.update_layout(title_text="Zipf's Law - distribution of terms before cleaning", title_x=0.5)
fig.update_traces(marker=dict(line=dict(color='black', width=1)))
# Show the plot
fig.show()
import plotly.express as px
import pandas as pd
# read all files and clean them
allFiles = load_allfiles()
allFiles_clean = clean_Files(allFiles)
# count terms in files
counter_allFiles_clean, counters_documents = summarize_Files(allFiles_clean)
# rank the cleaned terms by descending frequency
sorted_pairs = sorted(counter_allFiles_clean.items(), key=lambda kv: kv[1], reverse=True)
# Create a DataFrame
df = pd.DataFrame(sorted_pairs, columns=['Word', 'Frequency'])
# Create a bubble plot using Plotly Express
fig = px.scatter(df, x=df.index, y='Frequency', size='Frequency',
                 color='Frequency', opacity=0.9,
                 labels={'index': "Index of term in the sorted dictionary", 'Frequency': "Frequency of terms"},
                 template='plotly_white')
# Annotate the top 5 most frequent terms
top5 = df.head(5)
for idx, word, freq in zip(top5.index, top5['Word'], top5['Frequency']):
    fig.add_annotation(
        x=idx,
        y=freq,
        text=word,
        showarrow=False,
        arrowhead=3,
        ax=0,
        ay=-30
    )
# Add grid lines
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='grey', zeroline=False)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='grey')
# Set plot background color
fig.update_layout(paper_bgcolor='lightblue', plot_bgcolor='lightblue')
# Center the title
fig.update_layout(title_text="Zipf's Law - distribution of terms after cleaning", title_x=0.5)
fig.update_traces(marker=dict(line=dict(color='black', width=1)))
# Show the plot
fig.show()
Creating term-document-matrix¶
import numpy as np
def create_term_document_matrix(all_files_):
    """
    takes: collection of all clean documents
    returns: (term-document matrix, list of all terms)

    Rows are terms in corpus-vocabulary order, columns are documents;
    each cell holds the raw frequency of the term in that document.
    """
    # get the summary of the corpus
    counter_corpus, counter_documents = summarize_Files(all_files_)
    tdm = np.zeros((len(counter_corpus), len(all_files_)))
    for row, term in enumerate(counter_corpus):
        for col, doc_counter in enumerate(counter_documents):
            # Counter returns 0 for absent keys without inserting them,
            # so no explicit membership test is needed
            tdm[row, col] = doc_counter[term]
    return (tdm, list(counter_corpus.keys()))
In the resulting Term-Document Matrix it can be seen which words are contained in which documents. For example: the word first is present in Document1 only, while the word This is present in both documents.
Based on the above demo, we can create a term-document matrix for the entire corpus in Project Gutenberg (18 documents)
# create the TDM from the corpus
# (reload and clean the corpus so the matrix reflects the full pipeline)
allFiles = load_allfiles()
allFiles_clean = clean_Files(allFiles)
# tdm: terms x documents raw-frequency matrix; terms: the vocabulary list
tdm, terms = create_term_document_matrix(allFiles_clean)
import numpy as np
# anonymous labels: columns D1..Dn for documents, rows w1..wm for terms
n_rows, n_cols = tdm.shape
doc_labels = ['D%d' % (j + 1) for j in range(n_cols)]
term_labels = ['w%d' % (i + 1) for i in range(n_rows)]
# Convert to DataFrame with specified column labels and index names
tdm_2d = pd.DataFrame(tdm, index=term_labels, columns=doc_labels)
tdm_2d
| D1 | D2 | D3 | D4 | D5 | D6 | D7 | D8 | D9 | D10 | D11 | D12 | D13 | D14 | D15 | D16 | D17 | D18 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| w1 | 865.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| w2 | 301.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| w3 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| w4 | 3.0 | 6.0 | 3.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 2.0 | 1.0 | 1.0 | 15.0 | 0.0 | 2.0 | 1.0 | 1.0 | 3.0 |
| w5 | 57.0 | 24.0 | 50.0 | 0.0 | 0.0 | 0.0 | 0.0 | 12.0 | 1.0 | 0.0 | 16.0 | 10.0 | 181.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| w25472 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| w25473 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| w25474 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| w25475 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| w25476 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
25476 rows × 18 columns
import pandas as pd
# label rows with the actual terms and columns with the corpus file names
tdm_df = pd.DataFrame(tdm, columns=nltk.corpus.gutenberg.fileids(), index=terms)
tdm_df
| austen-emma.txt | austen-persuasion.txt | austen-sense.txt | bible-kjv.txt | blake-poems.txt | bryant-stories.txt | burgess-busterbrown.txt | carroll-alice.txt | chesterton-ball.txt | chesterton-brown.txt | chesterton-thursday.txt | edgeworth-parents.txt | melville-moby_dick.txt | milton-paradise.txt | shakespeare-caesar.txt | shakespeare-hamlet.txt | shakespeare-macbeth.txt | whitman-leaves.txt | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| emma | 865.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| jane | 301.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| austen | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| volum | 3.0 | 6.0 | 3.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 2.0 | 1.0 | 1.0 | 15.0 | 0.0 | 2.0 | 1.0 | 1.0 | 3.0 |
| chapter | 57.0 | 24.0 | 50.0 | 0.0 | 0.0 | 0.0 | 0.0 | 12.0 | 1.0 | 0.0 | 16.0 | 10.0 | 181.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| glum | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| demarc | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| cumul | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| germin | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| blither | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
25476 rows × 18 columns
problems with tdm: IT IS SPARSE!!!¶
# list of terms
# BUG FIX: the message read "There are N that can be used" — the noun
# "terms" was missing from the sentence
print(f'There are {len(terms)} terms that can be used to query files.\n')
print("the terms are: ")
# full vocabulary is long, so it stays commented out
#terms
There are 25476 that can be used to query files. the terms are:
4. Query processing and optimization¶
Query processing steps:¶
1. Query cleaning
2. Query execution
3. Sorting by order of Relevance
4.1 Query Cleaning¶
In order to check whether a document is relevant for the query, the query terms must also be preprocessed in the same way as we did with the entire corpus. This will make sure the query terms are aligned to the processed and clean representation in the term-document matrix.
Hence we have to do the following for query inputs as well:
- casefold each word in the query
- remove unnecessary characters in the query
- remove stop words in the query
- stem the words in the query
4.2. Query execution¶
To execute the query on the TDM, the score for each document has to be calculated. The score is thereby defined as the sum of the cells for the words of the query. As result, for each document, the number of words from the query which are contained in the document is provided.
def query_documents(tdm, terms, queryText_):
    """Score every document for *queryText_*.

    The query is tokenised and cleaned with the same pipeline as the
    corpus; each document's score is the sum of the TDM rows belonging
    to the query terms that exist in the vocabulary.
    """
    # tokenise and clean the query exactly like the corpus
    cleaned = clean_Text(queryText_.split())
    # TDM rows of the query terms that are in the vocabulary
    rows = [terms.index(term) for term in cleaned if term in terms]
    return tdm[rows].sum(axis=0)
# test the method
testquery = "God light heaven"
# query cleaning
cl_query = clean_Text(testquery.split())
print("Query text before cleaning:", testquery)
print("Query text after cleaning:", ' '.join(cl_query))
# query execution
docScore_0 = query_documents(tdm, terms, testquery)
print(docScore_0)
Query text before cleaning: God light heaven Query text after cleaning: god light heaven [2.900e+01 2.800e+01 3.100e+01 5.732e+03 3.300e+01 6.000e+01 4.000e+00 0.000e+00 2.640e+02 7.600e+01 1.100e+02 6.400e+01 3.550e+02 8.720e+02 3.500e+01 4.500e+01 2.800e+01 2.340e+02]
Query result¶
import pandas as pd
# rank the corpus files by their relevance score for the test query
fileNames = [fid.split('.')[0] for fid in nltk.corpus.gutenberg.fileids()]
df = pd.DataFrame({'Relevance score': docScore_0, 'File Names': fileNames})
queryResult = df.set_index(["File Names"]).sort_values(by='Relevance score', ascending=False)
print(f'query result for the query input: \n\n"{testquery}"')
queryResult
query result for the query input: "God light heaven"
| Relevance score | |
|---|---|
| File Names | |
| bible-kjv | 5732.0 |
| milton-paradise | 872.0 |
| melville-moby_dick | 355.0 |
| chesterton-ball | 264.0 |
| whitman-leaves | 234.0 |
| chesterton-thursday | 110.0 |
| chesterton-brown | 76.0 |
| edgeworth-parents | 64.0 |
| bryant-stories | 60.0 |
| shakespeare-hamlet | 45.0 |
| shakespeare-caesar | 35.0 |
| blake-poems | 33.0 |
| austen-sense | 31.0 |
| austen-emma | 29.0 |
| austen-persuasion | 28.0 |
| shakespeare-macbeth | 28.0 |
| burgess-busterbrown | 4.0 |
| carroll-alice | 0.0 |
4.3. Sorting by order of relevance¶
import pandas as pd
# Create a DataFrame of scores indexed by file name, sorted by relevance
fileNames = [name.split('.')[0] for name in nltk.corpus.gutenberg.fileids()]
df = pd.DataFrame({'Relevance score': docScore_0,
                   'File Names': fileNames}).set_index('File Names')
queryResult = df.sort_values(by='Relevance score', ascending=False)
print(f'\nResult for querying: "{testquery}"\n')
queryResult
Result for querying: "God light heaven"
| Relevance score | |
|---|---|
| File Names | |
| bible-kjv | 5732.0 |
| milton-paradise | 872.0 |
| melville-moby_dick | 355.0 |
| chesterton-ball | 264.0 |
| whitman-leaves | 234.0 |
| chesterton-thursday | 110.0 |
| chesterton-brown | 76.0 |
| edgeworth-parents | 64.0 |
| bryant-stories | 60.0 |
| shakespeare-hamlet | 45.0 |
| shakespeare-caesar | 35.0 |
| blake-poems | 33.0 |
| austen-sense | 31.0 |
| austen-emma | 29.0 |
| austen-persuasion | 28.0 |
| shakespeare-macbeth | 28.0 |
| burgess-busterbrown | 4.0 |
| carroll-alice | 0.0 |
# map score with file names
def mapFilesToScore(docScore):
    """Pair each Gutenberg file name (without extension) with its score."""
    names = [fid.split('.')[0] for fid in nltk.corpus.gutenberg.fileids()]
    return dict(zip(names, docScore))
def orderByRelevance(docScore):
    """Normalise scores to percentages (one decimal) and return a dict of
    file name -> score, sorted by descending relevance."""
    # normalise to percent of the total score, then round
    pct = list(np.round(docScore / np.sum(docScore) * 100, 1))
    # map file names to score
    name_to_score = mapFilesToScore(pct)
    return dict(sorted(name_to_score.items(),
                       key=lambda kv: kv[1], reverse=True))
# plot function
def barPlot(dict_, color_, title_='Query result of relevant documents'):
    """Render *dict_* (file name -> relevance %) as a matplotlib bar chart."""
    import matplotlib.pyplot as plt
    plt.style.use('bmh')
    plt.figure(figsize=(12, 6))
    plt.bar(list(dict_.keys()), list(dict_.values()), color=color_)
    plt.xlabel('file names')
    plt.ylabel('Percentage of document\'s relevance')
    plt.title(title_)
    plt.xticks(rotation=90)  # Rotate x-axis labels for better visibility
    plt.show()
def plotlyBar(data_dict, filename='bar_plot.png', bar_color='green', title='Bar Plot Example', xaxis_title='Categories', yaxis_title='Values', plot_bgcolor='lightblue', paper_bgcolor='lightblue'):
    """Bar chart of *data_dict* with a value annotation above each bar;
    the figure is saved to *filename* and shown inline."""
    import plotly.graph_objects as go
    labels = list(data_dict.keys())
    heights = list(data_dict.values())
    # Create a bar plot using Plotly Graph Objects
    fig = go.Figure()
    fig.add_trace(go.Bar(x=labels, y=heights, base=0, marker=dict(color=bar_color)))
    # Add annotations to the top of each bar
    for label, height in zip(labels, heights):
        fig.add_annotation(
            x=label,
            y=height,
            text=str(height),
            showarrow=True,
            arrowhead=4,
            ax=0,
            ay=-30
        )
    fig.update_layout(
        title=dict(
            text=title,
            x=0.5  # Set x to 0.5 for centering the title
        ),
        xaxis=dict(title=xaxis_title, tickangle=-90),
        yaxis=dict(title=yaxis_title),
        plot_bgcolor=plot_bgcolor,
        paper_bgcolor=paper_bgcolor
    )
    # Save the plot as an image file
    fig.write_image(filename)
    fig.show()
# map files to score and plot raw term-frequency relevance
testquery = "This is a Test TexT For that has to be cleaned 15%%%5615 ROBERT ;) DROP TABLE Students;"
# Snippets from austen-emma.txt, austen-persuasion.txt, and bible-kjv.txt
testquery1 = "Emma Woodhouse, clever comfortable home and happy"
testquery2 = "Walter Elliot Elizabeth, James Stevenson, South Park, Gloucester"
testquery3 = "God light heaven"
docScore_0 = query_documents(tdm, terms, testquery3)
files_ordrered = orderByRelevance(docScore_0)
plot_kwargs = dict(filename='bar_plot1.png',
                   bar_color='red',
                   title='Query result of relevant documents based term frequency',
                   xaxis_title='query: "God light heaven"',
                   yaxis_title='Relevance score',
                   paper_bgcolor='lightblue',
                   plot_bgcolor='lightblue')
plotlyBar(files_ordrered, **plot_kwargs)
# NOTE(review): leftover assignment from the original cell, apparently unused
filename='bar_plot.png'
def scale_tf(m):
    """Log-scale raw term frequencies: tf -> log(1 + tf).

    BUG FIX: the original returned np.log1p(m + 1) == log(m + 2), which
    assigns a non-zero weight (log 2) even to terms that do not occur in
    a document at all.  log1p(m) maps a zero count to zero weight, the
    standard sublinear tf scaling.
    """
    return np.log1p(m)
# log-scale the TDM and re-run the query on the scaled matrix
tdm_tf = scale_tf(tdm)
testquery = "This is a Test TexT For that has to be cleaned 15%%%5615 ROBERT ;) DROP TABLE Students;"
# Snippets from austen-emma.txt, austen-persuasion.txt, and bible-kjv.txt
testquery1 = "Emma Woodhouse, clever comfortable home and happy"
testquery2 = "Walter Elliot Elizabeth, James Stevenson, South Park, Gloucester"
testquery3 = "God light heaven"
docScore_scaled = query_documents(tdm_tf, terms, testquery3)
files_ordrered_scaled = orderByRelevance(docScore_scaled)
scaled_kwargs = dict(filename='bar_plot2.png',
                     bar_color='black',
                     title='Log scaled Query result',
                     xaxis_title='query: "God light heaven"',
                     yaxis_title='Relevance score',
                     paper_bgcolor='lightblue',
                     plot_bgcolor='lightblue')
plotlyBar(files_ordrered_scaled, **scaled_kwargs)
5.2. Further Adjustments with Inverse-Document-Frequency¶
def scale_tfidf(m):
    """TF-IDF scale a term-document matrix.

    Log-scales the term frequencies, then multiplies each term row by
    idf = log(N / df), where N is the number of documents and df is the
    number of documents that contain the term.

    BUG FIX: the original computed idf from m.sum(axis=1) (the total
    log-tf mass of each term) and left the document-frequency mask
    (`tmp = m != 0`) unused.  idf must be based on the count of
    documents in which the term occurs.  The df count is taken from the
    RAW matrix, before log scaling, so it is correct regardless of how
    scale_tf maps zero counts.
    """
    # document frequency: in how many documents does each term occur?
    df = (m != 0).sum(axis=1)
    num_documents = m.shape[1]
    # every corpus term occurs in at least one document, so df >= 1 here
    idf = np.log(num_documents / df)
    m = scale_tf(m)
    return (m.T * idf).T
# tf-idf scale the TDM and re-run the query on the weighted matrix
tdm_tfidf = scale_tfidf(tdm)
testquery = "Sherlock Holmes detective stories be cleaned 15%%%5615 ROBERT ;) DROP TABLE Students;"
# Snippets from austen-emma.txt, austen-persuasion.txt, and bible-kjv.txt
testquery1 = "Emma Woodhouse, clever comfortable home and happy"
testquery2 = "Walter Elliot Elizabeth, James Stevenson, South Park, Gloucester"
testquery3 = "God light heaven"
docScore_scaled_tfidf = query_documents(tdm_tfidf, terms, testquery3)
files_ordrered_scaled_tfidf = orderByRelevance(docScore_scaled_tfidf)
#files_ordrered_scaled_tfidf
tfidf_kwargs = dict(filename='bar_plot3.png',
                    bar_color='green',
                    title='Query result after log & idf scaling',
                    xaxis_title='query: "God light heaven"',
                    yaxis_title='Relevance score',
                    paper_bgcolor='lightblue',
                    plot_bgcolor='lightblue')
plotlyBar(files_ordrered_scaled_tfidf, **tfidf_kwargs)